import os
import re

import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = 'https://cvpr.thecvf.com/Conferences/2024/AcceptedPapers'

# Fetch the page. The timeout prevents the script from hanging forever on a
# stalled connection, and raise_for_status() aborts on HTTP 4xx/5xx so an
# error page is never parsed and saved as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the returned HTML with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract the text of every <p> element on the page
text_contents = [p.get_text() for p in soup.find_all('p')]

# Clean the extracted text
cleaned_text = []
for line in text_contents:
    # Drop leading/trailing whitespace (including newlines)
    line = line.strip()
    # Collapse each run of whitespace into a single space; the pattern is a
    # raw string so that "\s" is not treated as an (invalid) string escape
    line = re.sub(r'\s+', ' ', line)
    # Keep only entries that still contain text after cleaning
    if line:
        cleaned_text.append(line)

# Write the cleaned text to a txt file, one entry per line. The output
# directory is created first so that open() does not fail on a fresh checkout.
output_path = 'data/accepted_papers_cleaned.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in cleaned_text)

print('Cleaned text content has been saved to accepted_papers_cleaned.txt')
